1 Setup

1.1 Create conda evironment

conda env create \
  -n mikk_env \
  -f mikk_genome/code/config/conda_env.yml
  
conda activate mikk_env

1.2 Load R packages

require(here)
require(tidyverse)

1.3 Create directory structure and clone repo

(Working directory on EBI cluster: /hps/research1/birney/users/ian/mikk_paper)

# move to working directory
cd /your/working/directory
# clone git repository
git clone https://github.com/Ian-Brettell/mikk_genome.git
# create directory for VCFs
mkdir vcfs

1.4 Copy MIKK panel VCF into working directory

(See supplementary material for how VCF was generated.)

cp /nfs/research1/birney/projects/medaka/inbred_panel/medaka-alignments-release-94/vcf/medaka_inbred_panel_ensembl_new_reference_release_94.vcf* vcfs

1.5 Key-value file for cram ID to line ID

mikk_genome/data/20200206_cram_id_to_line_id.txt

1.6 Remove sibling lines and replicates

Full list of 80 extant MIKK panel lines: mikk_genome/data/20200210_panel_lines_full.txt

Note: Line 130-2 is missing from the MIKK panel VCF.

Identify sibling lines

cat mikk_genome/data/20200210_panel_lines_full.txt | cut -f1 -d"-" | sort | uniq -d
  • 106
  • 11
  • 117
  • 131
  • 132
  • 135
  • 14
  • 140
  • 23
  • 39
  • 4
  • 40
  • 59
  • 69
  • 72
  • 80

Only keep first sibling line ( suffix _1); manually remove all others and write list of non-sibling lines to here: mikk_genome/data/20200227_panel_lines_no-sibs.txt. 64 lines total.

Excluded sibling lines here: mikk_genome/data/20200227_panel_lines_excluded.txt. 16 lines total.

Replace all dashes with underscores to match mikk_genome/data/20200206_cram_id_to_line_id.txt key file

sed 's/-/_/g' mikk_genome/data/20200227_panel_lines_no-sibs.txt \
  > mikk_genome/data/20200227_panel_lines_no-sibs_us.txt

Extract the lines to keep from the key file.

awk  'FNR==NR {f1[$0]; next} $2 in f1' \
  mikk_genome/data/20200227_panel_lines_no-sibs_us.txt \
  mikk_genome/data/20200206_cram_id_to_line_id.txt \
    > mikk_genome/data/20200227_cram2line_no-sibs.txt

Has 66 lines instead of 63 (64 lines minus 130-2, which isn’t in the VCF), so there must be replicates Find out which ones:

cat mikk_genome/data/20200227_cram2line_no-sibs.txt | cut -f2 | cut -f1 -d"_" | sort | uniq -d

32 71 84

Manually removed duplicate lines (mikk_genome/data/20200227_duplicates_excluded.txt):

  • 24271_7#5 32_2
  • 24271_8#4 71_1
  • 24259_1#1 84_2

Final no-sibling-lines CRAM-to-lineID key file: mikk_genome/data/20200227_cram2line_no-sibs.txt

2 Create no-sibling-lines MIKK panel VCF

# create no-sibs file with CRAM ID only
cut -f1 mikk_genome/data/20200227_cram2line_no-sibs.txt \
  > mikk_genome/data/20200227_cram2line_no-sibs_cram-only.txt
  
# make new VCF having filtered out non-MIKK and sibling lines
bcftools view \
  --output-file vcfs/panel_no-sibs.vcf \
  --samples-file mikk_genome/data/20200227_cram2line_no-sibs_cram-only.txt \
  vcfs/medaka_inbred_panel_ensembl_new_reference_release_94.vcf
  
# recode with line IDs
bcftools reheader \
  --output vcfs/panel_no-sibs_line-ids.vcf \
  --samples mikk_genome/data/20200227_cram2line_no-sibs.txt \
  vcfs/panel_no-sibs.vcf
  
# compress
bcftools view \
  --output-type z \
  --output-file vcfs/panel_no-sibs_line-ids.vcf.gz \
  vcfs/panel_no-sibs_line-ids.vcf
  
# index
bcftools index \
  --tbi \
  vcfs/panel_no-sibs_line-ids.vcf.gz

# get stats
mkdir stats

bcftools stats \
  vcfs/panel_no-sibs_line-ids.vcf.gz \
  > stats/20200305_panel_no-sibs.txt

## get basic counts
grep "^SN" stats/20200305_panel_no-sibs.txt

2.1 Make a version with no missing variants

vcftools \
  --gzvcf vcfs/panel_no-sibs_line-ids.vcf.gz \
  --max-missing 1 \
  --recode \
  --stdout > vcfs/panel_no-sibs_line-ids_no-missing.vcf
  
# compress
bcftools view \
  --output-type z \
  --output-file vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz \
  vcfs/panel_no-sibs_line-ids_no-missing.vcf

# create index
bcftools index \
  --tbi vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz
  
# get stats 
bcftools stats \
  vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz \
  > stats/20200305_panel_no-sibs_no-missing.txt

# get basic counts
grep "^SN" stats/20200305_panel_no-sibs_no-missing.txt

3 Generate Haploview plots

3.2 Create BED sets filtered for MAF > 0.03, 0.05 and 0.10

maf_thresholds=$( echo 0.03 0.05 0.10 )

# Make new BEDs 
for i in $maf_thresholds ; do
  # make directory
  new_path=plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_maf-$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi
  # make BED set
  plink \
    --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200716 \
    --make-bed \
    --double-id \
    --chr-set 24 no-xy \
    --maf $i \
    --out $new_path/20200803
done

3.3 Recode for Haploview

# Create output directory
mkdir plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_hv_thinned

hv_thinned_path=plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_hv_thinned

# Recode
for i in $maf_thresholds ; do
  new_path=$hv_thinned_path/$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi 
  # recode 
  for j in $(seq 1 24); do
    plink \
      --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_maf-$i/20200803 \
      --recode HV-1chr \
      --double-id \
      --chr-set 24 no-xy \
      --chr $j \
      --allele1234 \
      --thin-count 3000 \
      --out $hv_thinned_path/$i/20200803_chr-$j;
  done;
done

# Edit .ped files to remove asterisks
for i in $maf_thresholds ; do
  for j in $(find $hv_thinned_path/$i/20200803_chr-*.ped); do
    sed -i 's/\*/0/g' $j;
  done;
done  

# Edit .info files to make the SNP's bp position its ID
for i in $maf_thresholds; do
  for j in $(find $hv_thinned_path/$i/20200803_chr*.info); do
    outname=$(echo $j\_with-id);
    awk -v OFS="\t" {'print $2,$2'} $j > $outname;
  done;
done

3.4 Plot

NOTE: This code requires Haploview, which you will need to install on your system: https://www.broadinstitute.org/haploview/haploview

hv_path=/nfs/software/birney/Haploview.jar # edit to your Haploview path

mkdir plots/20200803_ld_thinned/

for i in $maf_thresholds; do
  # set output directory
  new_path=plots/20200803_ld_thinned/$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi   
  for j in $(seq 1 24); do
    bsub -M 20000 -o log/20200803_hv_$i\_$j.out -e log/20200803_hv_$i\_$j.err \
    "java -Xms18G -Xmx18G -jar $hv_path \
      -memory 18000 \
      -pedfile $hv_thinned_path/$i/20200803_chr-$j.ped  \
      -info $hv_thinned_path/$i/20200803_chr-$j.info_with-id \
      -maxDistance 1000 \
      -ldcolorscheme DEFAULT \
      -ldvalues RSQ \
      -minMAF $i \
      -nogui \
      -svg \
      -out $new_path/$j";
  done;
done

These svg files can be converted to pdf using: * https://www.zamzar.com/ for files > 30 MB (chr 1) - note limit on number of files you can convert * https://onlineconvertfree.com/convert-format/svg-to-pdf/ for the rest

The full Haploview LD plots are available in the Supplementary Material.

By inspecting these LD plots at the MAF > 0.05 level, we discovered the following LD blocks worthy of further investigation:

  • 5:28181970-28970558 (788 Kb)
  • 6:29398579-32246747 (2.85 Mb)
  • 12:25336174-25384053 (48 Kb)
  • 14:12490842-12947083 (456 Kb)
  • 17:15557892-19561518 (4 Mb)
  • 21:6710074-7880374 (1.17 Mb)

See zoomed plots here:

5:28181970-28970558

6:29398579-32246747

12:25336174-25384053 14:12490842-129470833

17:15557892-19561518

hv_21_6710074-7880374

4 LD decay

4.1 Obtain

# make BED
mkdir plink/20200727_mikk_no-missing_maf-0.05

plink \
  --vcf vcfs/panel_no-sibs_line-ids.vcf.gz \
  --make-bed \
  --double-id \
  --snps-only \
  --biallelic-only \
  --maf 0.05 \
  --geno 0 \
  --chr-set 24 no-xy \
  --out plink/20200727_mikk_no-missing_maf-0.05/20200727

# get LD   
mkdir ld/20200727_mikk_maf-0.10_window-50kb_no-missing/

for i in $(seq 1 24); do
  plink \
      --bfile plink/20200727_mikk_no-missing_maf-0.05/20200727 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 50 \
      --ld-window-r2 0 \
      --chr-set 24 no-xy \
      --chr $i \
      --maf 0.10 \
      --out ld/20200727_mikk_maf-0.10_window-50kb_no-missing/$i;
done

# for 1KG too
mkdir ld/20200727_1kg_maf-0.10_window-50kb_no-missing/

for i in $(seq 1 22); do
  plink \
      --bfile plink/20200723_1gk_no-missing_maf-0.05/20200723 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 50 \
      --ld-window-r2 0 \
      --chr $i \
      --maf 0.10 \
      --out ld/20200727_1kg_maf-0.10_window-50kb_no-missing/$i;
done

# do again with ld-window-kb 10 to get counts of comparisons for paper
# get LD   
mkdir ld/20200803_mikk_maf-0.10_window-10kb_no-missing/

for i in $(seq 1 24); do
  plink \
      --bfile plink/20200727_mikk_no-missing_maf-0.05/20200727 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 10 \
      --ld-window-r2 0 \
      --chr-set 24 no-xy \
      --chr $i \
      --maf 0.10 \
      --out ld/20200803_mikk_maf-0.10_window-10kb_no-missing/$i;
done

# for 1KG too
mkdir ld/20200803_1kg_maf-0.10_window-10kb_no-missing/

for i in $(seq 1 22); do
  plink \
      --bfile plink/20200723_1gk_no-missing_maf-0.05/20200723 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 10 \
      --ld-window-r2 0 \
      --chr $i \
      --maf 0.10 \
      --out ld/20200803_1kg_maf-0.10_window-10kb_no-missing/$i;
done

# Get counts
wc -l ld/20200803_mikk_maf-0.10_window-10kb_no-missing/*.ld
# Total: 
wc -l ld/20200803_1kg_maf-0.10_window-10kb_no-missing/*.ld